library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.0 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes
library(adegenet)
## Loading required package: ade4
## Registered S3 method overwritten by 'spdep':
## method from
## plot.mst ape
##
## /// adegenet 2.1.1 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(cluster)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes"
source("Functions/all_functions.R")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
median_sample_parameters_GC.df <- read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_median_parameters_GC.csv") %>%
filter(strain_group != "CONTROL") %>%
select_if(grepl("OD", names(.)) | grepl("sample_id", names(.))) %>%
select(-end_point_timepoint_OD)
median_sample_parameters_PI.df <- read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_median_parameters_PI.csv") %>%
filter(strain_group != "CONTROL") %>%
select(-end_point_timepoint_death)
median_sample_parameters_all.df <- merge(median_sample_parameters_GC.df, median_sample_parameters_PI.df, by = "sample_id")
# correct sample metadata (mortality annotation for last isolate (not index isolate) + all isolates from survived patient = survived)
sample_meta.df <- read.csv("Ideas_Grant_2020_analysis/Raw_data/strain_metadata_corrected_mortality_with_controls.csv")
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.)) | grepl("sample_id", names(.))) %>%
merge(., sample_meta.df, by = "sample_id")%>%
mutate(ST_simp = fct_lump(ST, n = 5))
median_param_dat.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.)))
for (var in grep("OD", colnames(median_sample_parameters_all.df), value = T)) {
t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
geom_density(aes(fill = mortality), alpha = .5)
print(t)
}
for (var in grep("death", colnames(median_sample_parameters_all.df), value = T)) {
t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
geom_density(aes(fill = mortality), alpha = .5)
print(t)
}
for (var in grep("OD", colnames(median_sample_parameters_all.df), value = T)) {
t <- ggviolin(data = median_sample_parameters_all.df,
y = var,
x = "mortality",
fill = "mortality") +
theme_bw() +
theme(legend.position = "none")+
stat_compare_means(ref.group ="Survived",
method = "wilcox.test",
label = "p.signif")
print(t)
}
for (var in grep("death", colnames(median_sample_parameters_all.df), value = T)) {
t <- ggviolin(data = median_sample_parameters_all.df,
y = var,
x = "mortality",
fill = "mortality") +
theme_bw() +
theme(legend.position = "none")+
stat_compare_means(ref.group ="Survived",
method = "wilcox.test",
label = "p.signif")
print(t)
}
Rationale: It seems that for several GC PI parameters there more variance in the died group Levene test is designed to test for this http://www.sthda.com/english/wiki/compare-multiple-sample-variances-in-r
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
for (i in 1:length(colnames(median_param_dat.df))) {
print( paste0("Current tested variable is ", colnames(median_param_dat.df)[i]))
print(leveneTest(median_param_dat.df[[i]], median_sample_parameters_all.df$mortality))
print("")
print("")
}
## [1] "Current tested variable is time_of_max_rate_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 28.141 2.595e-07 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is max_rate_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 35.798 8.063e-09 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is doubling_time_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 12.857 0.0004085 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is AUC_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 70.381 4.5e-15 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 7.7239 0.005888 **
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is max_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 32.74 3.178e-08 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_min_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.0542 0.8161
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is min_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 5.8673 0.01618 *
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is end_point_OD"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 18.218 2.854e-05 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is AUC_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.9854 0.3219
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.8397 0.1763
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is max_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.8513 0.3571
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_min_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 22.525 3.596e-06 ***
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is min_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 6.8658 0.009356 **
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_rate_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.4615 0.2279
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is max_rate_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.5142 0.474
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is doubling_time_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.033 0.856
## 236
## [1] ""
## [1] ""
## [1] "Current tested variable is end_point_death"
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 4.6914 0.03132 *
## 236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
fligner.test(max_rate_OD ~ mortality, data = median_sample_parameters_all.df)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: max_rate_OD by mortality
## Fligner-Killeen:med chi-squared = 36.209, df = 1, p-value =
## 1.773e-09
M <-cor(median_param_dat.df, method = "pearson")
corrplot(M, type="upper")
M <-cor(median_param_dat.df, method = "kendall")
corrplot(M, type="upper")
M <-cor(median_param_dat.df, method = "spearman")
corrplot(M, type="upper")
my_cols <- c("#00AFBB", "#E7B800")#, "#FC4E07")
pairs(median_param_dat.df, pch = 19, cex = 0.5,
col = my_cols[median_sample_parameters_all.df$mortality],
lower.panel=NULL)
pca.df <- median_param_dat.df
row.names(pca.df) <- median_sample_parameters_all.df$sample_id
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T)
fviz_screeplot(pca1, addlabels = T)
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$mortality,
col.circle = median_sample_parameters_all.df$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$mortality,
col.circle = median_sample_parameters_all.df$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
#ST
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$ST_simp,
col.circle = median_sample_parameters_all.df$ST_simp ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$ST_simp,
col.circle = median_sample_parameters_all.df$ST_simp ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_contrib(pca1, choice = "var", axes = 1)
fviz_contrib(pca1, choice = "var", axes = 2)
fviz_contrib(pca1, choice = "var", axes = 1:4)
pca.df <- median_param_dat.df %>%
# select(AUC_death, max_death, end_point_OD, AUC_OD, max_OD, end_point_death, max_rate_death, max_rate_OD) %>%
# select(AUC_death, AUC_OD, max_OD, max_rate_death, max_rate_OD, doubling_time_OD) %>%
filter(!is.na(median_sample_parameters_all.df$mortality))
#row.names(pca.df) <- median_sample_parameters_all.df$sample_id
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)
fviz_screeplot(pca1, addlabels = T)
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
pca.df <- median_param_dat.df %>%
filter(!is.na(median_sample_parameters_all.df$mortality))
row.names(pca.df) <- median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$sample_id
# All vs All
dapc1 <- adegenet::dapc(x = pca.df,
grp = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality == "Died",
n.pca = 6,
n.da = 2,
scale = T,
center = T)
summary(dapc1)
## $n.dim
## [1] 1
##
## $n.pop
## [1] 2
##
## $assign.prop
## [1] 0.8571429
##
## $assign.per.pop
## FALSE TRUE
## 0.9714286 0.5396825
##
## $prior.grp.size
##
## FALSE TRUE
## 175 63
##
## $post.grp.size
##
## FALSE TRUE
## 199 39
scatter.dapc(dapc1)
loadingplot(dapc1$var.contr)
dapc0 <-data.frame(comparison = "All vs ALL",
var = row.names(dapc1$var.contr),
contrib = as.character(dapc1$var.contr),
row.names = NULL)
pca.df <- pca.df %>%
select(time_of_max_OD, time_of_min_death, time_of_max_rate_OD, time_of_max_rate_death, max_rate_OD )
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)
fviz_screeplot(pca1, addlabels = T)
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
pca.df <- median_param_dat.df %>%
select(time_of_max_OD, time_of_min_death, time_of_max_rate_OD, time_of_max_rate_death, max_rate_OD )
row.names(pca.df) <- median_sample_parameters_all.df$sample_id
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T)
fviz_screeplot(pca1, addlabels = T)
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$mortality,
col.circle = median_sample_parameters_all.df$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df$mortality,
col.circle = median_sample_parameters_all.df$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)
library(randomForest)
# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
mutate(ID = row.names(.))
rf.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
mutate(mortality = median_sample_parameters_all.df$mortality) %>%
mutate(ID = median_sample_parameters_all.df$ID) %>%
mutate(ST_simp = median_sample_parameters_all.df$ST_simp)
# can filter or sample here
rf.df <- rf.df %>%
filter(!is.na(mortality))
rownames(rf.df) <- rf.df$ID
set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]
pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp)
resp_rf.vec <- rf.df$mortality
model1 <- randomForest(x = pred_rf.df, y= resp_rf.vec, data = EM.rf.df, importance = TRUE, ntree = 100)
model1
##
## Call:
## randomForest(x = pred_rf.df, y = resp_rf.vec, ntree = 100, importance = TRUE, data = EM.rf.df)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 11.34%
## Confusion matrix:
## Died Survived class.error
## Died 41 22 0.34920635
## Survived 5 170 0.02857143
varImpPlot(model1)
# Predicting on Validation set
predValid <- predict(model1, rf.df, type = "class")
predValid <- as.data.frame(predValid) %>%
mutate(ID = rownames(.)) %>%
rename(Prediction = predValid) %>%
merge(., rf.df %>% select(ID, mortality), by = "ID")%>%
droplevels() %>%
mutate(Accurate = ifelse(Prediction == mortality, T, F))
# Predict class proba
predValid_prob <- predict(model1, rf.df, type="prob")%>%
as.data.frame(.) %>%
mutate(ID = rownames(.)) %>%
merge(., predValid)
# Checking classification accuracy
mean(predValid$Accurate)
## [1] 1
table(predValid$Prediction, predValid$mortality)
##
## Died Survived
## Died 63 0
## Survived 0 175
table(predValid$mortality, predValid$Accurate)
##
## TRUE
## Died 63
## Survived 175
# Keep n best predicted for each strain
best_pred.df <- predValid_prob %>%
filter(Accurate == T) %>%
mutate(Accurate_proba = pmax(Died, Survived)) %>%
group_by(mortality) %>%
#filter(Accurate_proba > 0.9)
top_frac(.5, Accurate_proba)
count(best_pred.df, mortality)
## # A tibble: 2 x 2
## # Groups: mortality [2]
## mortality n
## <fct> <int>
## 1 Died 31
## 2 Survived 103
pca_meta.df3 <- rf.df %>%
filter(ID %in% best_pred.df$ID)
pca.df3 <- pca_meta.df3 %>%
select(-ID, -mortality, -ST_simp)
pca3 <- dudi.pca(df = pca.df3, scannf = F, nf = 5, scale = T)
fviz_screeplot(pca3, addlabels = T)
fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$mortality,
col.circle = pca_meta.df3$mortality,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)
fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$mortality,
col.circle = pca_meta.df3$mortality,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)
fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .8,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$ST_simp,
col.circle = pca_meta.df3$ST_simp,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)
## Too few points to calculate an ellipse
fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .95,
addEllipses = T,
#ellipse.level = .99,
col.ind = pca_meta.df3$ST_simp,
col.circle = pca_meta.df3$ST_simp,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)
## Too few points to calculate an ellipse
# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
mutate(ID = row.names(.))
rf.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
mutate(mortality = median_sample_parameters_all.df$mortality) %>%
mutate(ID = median_sample_parameters_all.df$ID) %>%
mutate(ST_simp = median_sample_parameters_all.df$ST_simp) %>%
mutate(sample_id = median_sample_parameters_all.df$sample_id)
# can filter or sample here
rf.df <- rf.df %>%
filter(!is.na(mortality))
rownames(rf.df) <- rf.df$ID
set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]
pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp, -sample_id)
resp_rf.vec <- rf.df$mortality
model1 <- randomForest(x = pred_rf.df, data = EM.rf.df, importance = TRUE, ntree = 100)
model1
##
## Call:
## randomForest(x = pred_rf.df, ntree = 100, importance = TRUE, data = EM.rf.df)
## Type of random forest: unsupervised
## Number of trees: 100
## No. of variables tried at each split: 4
varImpPlot(model1)
prox <- model1$proximity
pam.rf <- pam(prox, 2)
pred <- cbind(pam.rf$clustering, as.character(rf.df$sample_id), as.character(rf.df$ST_simp), as.character(rf.df$mortality))
table(pred[,1], pred[,2])
##
## BPH2700 BPH2701 BPH2702 BPH2703 BPH2704 BPH2705 BPH2706 BPH2707
## 1 1 0 0 0 0 1 1 0
## 2 0 1 1 1 1 0 0 1
##
## BPH2708 BPH2709 BPH2710 BPH2711 BPH2712 BPH2713 BPH2714 BPH2715
## 1 0 0 0 0 1 0 0 0
## 2 1 1 1 1 0 1 1 1
##
## BPH2716 BPH2717 BPH2718 BPH2719 BPH2720 BPH2721 BPH2722 BPH2723
## 1 1 1 0 0 0 1 0 0
## 2 0 0 1 1 1 0 1 1
##
## BPH2724 BPH2725 BPH2726 BPH2727 BPH2728 BPH2729 BPH2730 BPH2731
## 1 0 0 0 0 1 0 1 0
## 2 1 1 1 1 0 1 0 1
##
## BPH2732 BPH2733 BPH2734 BPH2735 BPH2736 BPH2737 BPH2738 BPH2739
## 1 1 1 0 0 0 0 0 0
## 2 0 0 1 1 1 1 1 1
##
## BPH2740 BPH2741 BPH2742 BPH2743 BPH2744 BPH2745 BPH2746 BPH2747
## 1 1 1 0 1 1 0 1 0
## 2 0 0 1 0 0 1 0 1
##
## BPH2748 BPH2749 BPH2751 BPH2752 BPH2753 BPH2754 BPH2755 BPH2756
## 1 0 1 1 1 1 1 1 1
## 2 1 0 0 0 0 0 0 0
##
## BPH2757 BPH2758 BPH2759 BPH2760 BPH2761 BPH2763 BPH2764 BPH2765
## 1 1 0 1 1 0 0 1 0
## 2 0 1 0 0 1 1 0 1
##
## BPH2766 BPH2767 BPH2768 BPH2769 BPH2770 BPH2771 BPH2773 BPH2774
## 1 0 1 1 0 0 0 0 0
## 2 1 0 0 1 1 1 1 1
##
## BPH2775 BPH2776 BPH2777 BPH2778 BPH2779 BPH2780 BPH2781 BPH2782
## 1 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1
##
## BPH2783 BPH2784 BPH2785 BPH2786 BPH2787 BPH2788 BPH2789 BPH2790
## 1 0 0 0 1 1 0 0 0
## 2 1 1 1 0 0 1 1 1
##
## BPH2791 BPH2792 BPH2793 BPH2794 BPH2795 BPH2796 BPH2797 BPH2798
## 1 0 1 1 0 1 1 1 0
## 2 1 0 0 1 0 0 0 1
##
## BPH2799 BPH2800 BPH2801 BPH2802 BPH2803 BPH2804 BPH2805 BPH2806
## 1 1 0 1 1 1 0 1 0
## 2 0 1 0 0 0 1 0 1
##
## BPH2807 BPH2808 BPH2809 BPH2810 BPH2811 BPH2812 BPH2813 BPH2814
## 1 0 0 0 1 0 1 1 1
## 2 1 1 1 0 1 0 0 0
##
## BPH2815 BPH2816 BPH2817 BPH2818 BPH2819 BPH2820 BPH2821 BPH2822
## 1 1 0 1 1 1 1 1 1
## 2 0 1 0 0 0 0 0 0
##
## BPH2823 BPH2824 BPH2825 BPH2827 BPH2828 BPH2829 BPH2830 BPH2831
## 1 0 0 0 0 0 1 1 1
## 2 1 1 1 1 1 0 0 0
##
## BPH2832 BPH2833 BPH2834 BPH2835 BPH2836 BPH2837 BPH2838 BPH2839
## 1 0 1 0 1 1 1 1 1
## 2 1 0 1 0 0 0 0 0
##
## BPH2840 BPH2841 BPH2842 BPH2843 BPH2844 BPH2845 BPH2846 BPH2847
## 1 1 1 0 0 1 1 1 0
## 2 0 0 1 1 0 0 0 1
##
## BPH2848 BPH2849 BPH2850 BPH2851 BPH2852 BPH2853 BPH2854 BPH2855
## 1 1 1 1 0 0 1 0 0
## 2 0 0 0 1 1 0 1 1
##
## BPH2856 BPH2857 BPH2858 BPH2859 BPH2860 BPH2861 BPH2862 BPH2863
## 1 0 0 1 1 1 1 1 1
## 2 1 1 0 0 0 0 0 0
##
## BPH2864 BPH2865 BPH2866 BPH2867 BPH2896 BPH2902 BPH2904 BPH2934
## 1 1 0 0 0 0 0 1 0
## 2 0 1 1 1 1 1 0 1
##
## BPH2938 BPH2942 BPH2943 BPH2949 BPH2962 BPH2963 BPH2976 BPH2977
## 1 0 1 1 1 1 1 0 0
## 2 1 0 0 0 0 0 1 1
##
## BPH2978 BPH2979 BPH2980 BPH2981 BPH2982 BPH3225 BPH3252 BPH3276
## 1 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1
##
## BPH3306 BPH3328 BPH3336 BPH3337 BPH3350 BPH3358 BPH3360 BPH3361
## 1 0 0 1 0 0 1 0 1
## 2 1 1 0 1 1 0 1 0
##
## BPH3367 BPH3383 BPH3384 BPH3394 BPH3396 BPH3412 BPH3416 BPH3423
## 1 0 0 0 0 0 1 1 1
## 2 1 1 1 1 1 0 0 0
##
## BPH3425 BPH3427 BPH3448 BPH3451 BPH3455 BPH3478 BPH3483 BPH3504
## 1 1 0 1 0 1 1 0 1
## 2 0 1 0 1 0 0 1 0
##
## BPH3506 BPH3515 BPH3521 BPH3525 BPH3533 BPH3535 BPH3541 BPH3549
## 1 1 0 1 1 1 0 1 0
## 2 0 1 0 0 0 1 0 1
##
## BPH3560 BPH3563 BPH3572 BPH3594 BPH3617 BPH3627 BPH3644 BPH3646
## 1 1 0 1 0 1 1 1 1
## 2 0 1 0 1 0 0 0 0
##
## BPH3676 BPH3680 BPH3681 BPH3698 BPH3706 BPH3708 BPH3712 BPH3723
## 1 1 1 1 1 0 0 1 1
## 2 0 0 0 0 1 1 0 0
##
## BPH3725 BPH3733 BPH3734 BPH3747 BPH3752 BPH3757
## 1 0 0 1 0 0 0
## 2 1 1 0 1 1 1
table(pred[,1], pred[,3])
##
## - 22 239 45 5 Other
## 1 13 1 12 9 15 62
## 2 18 21 29 11 6 41
table(pred[,1], pred[,4])
##
## Died Survived
## 1 32 80
## 2 31 95
cluster.df <-cbind(pam.rf$clustering, as.character(rf.df$sample_id)) %>% as.data.frame()
colnames(cluster.df) <- c("cluster", "sample_id")
pca_meta.df4 <- rf.df %>%
merge(., cluster.df , by = "sample_id")
pca.df4 <- pca_meta.df4 %>%
select(-sample_id, -cluster, -ST_simp, -mortality, -ID)
pca4 <- dudi.pca(df = pca.df4, scannf = F, nf = 5, scale = T)
fviz_screeplot(pca4, addlabels = T)
fviz_pca_biplot(pca4,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df4$cluster,
title = paste("" ),
repel = T,
pointshape = 16)
fviz_pca_biplot(pca4,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df4$mortality,
title = paste("" ),
repel = T,
pointshape = 16)